# Package setup.
# NOTE(review): data.table and tidytable are both attached; because tidytable
# comes after data.table (and after tidyverse), its verbs mask the earlier
# packages' where names collide — attachment order matters here.
library(tidyverse)
library(ggraph)
library(tidygraph)
library(igraph)
library(data.table)
library(tidytable)
library(statnet)
# Suppress messages/warnings in the rendered document.
knitr::opts_chunk$set(message=FALSE, warning=FALSE)
# Actor-work records with divergence scores. Columns used below include
# actor_id, estc_id, J_divergence, publication_year, publication_decade,
# short_title, author — assumed from usage; confirm against the file.
gkl_actors = fread('gkl_actors_full.csv')
# Unified ESTC actor metadata (actor_id, name_unified, viaf_link, ...).
all_actors = fread('../estc-data-unified/estc-actors-unified/actors.tsv')
Bipartite graph, projected to actor links:
# Build a bipartite work-actor graph from low-divergence (< .42) records and
# project it onto actor-actor links; edge weight = number of shared works.
g = gkl_actors %>% filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
group_by(estc_id, actor_id) %>%
# One edge per work-actor pair, carrying the mean divergence as an attribute.
summarise(diverg = mean(J_divergence)) %>%
graph_from_data_frame( directed=FALSE)
# Mark the two node sets (works vs actors) so the graph can be projected.
V(g)$type <- bipartite_mapping(g)$type
#gg <- unipartite_projection_attr(gkl_actors_graph, "diverg", FALSE)
# bipartite_projection() is the current igraph name; bipartite.projection()
# is a deprecated alias. gg[[1]] = work projection, gg[[2]] = actor projection.
gg = bipartite_projection(g)
# Actors ranked by weighted degree (sum of edge weights = total number of
# collaborations), joined to human-readable names.
# The original piped through activate(edges) first, but that was immediately
# overridden by activate(nodes), so the dead statement is removed here.
gg[[2]] %>%
as_tbl_graph() %>%
activate(nodes) %>%
mutate(wtd_degree = centrality_degree(weights = weight)) %>%
as_tibble() %>%
arrange(desc(wtd_degree)) %>%
rename(actor_id = name) %>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = 'actor_id')
# Weighted-degree distribution on log-log axes (visual power-law check).
# Redundant activate(edges) removed: it was immediately overridden by
# activate(nodes) and had no effect.
gg[[2]] %>%
as_tbl_graph() %>%
activate(nodes) %>%
mutate(wtd_degree = centrality_degree(weights = weight)) %>%
as_tibble() %>%
count(wtd_degree) %>%
ggplot() + geom_point(aes(wtd_degree,n)) + scale_x_log10()+ scale_y_log10()
Not normally distributed, but not scale-free (following a power law) either.
(Betweenness: the number of shortest paths between every pair of nodes that pass through that node.)
# Actors ranked by betweenness centrality, with names attached.
# Redundant activate(edges) removed (immediately overridden by activate(nodes)).
gg[[2]] %>%
as_tbl_graph() %>%
activate(nodes) %>%
mutate(betweenness = centrality_betweenness()) %>%
as_tibble() %>%
arrange(desc(betweenness)) %>%
rename(actor_id = name) %>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = 'actor_id')
These two metrics are often related:
# Publication counts per actor (low-divergence records only); used to size points.
total_pub_counts = gkl_actors %>% filter(J_divergence<.42) %>% count(actor_id) %>% filter(!is.na(actor_id))
# Betweenness rank vs weighted-degree rank; the shaded rectangle marks the
# 'surprisingly high betweenness' region inspected in the prose below.
gg[[2]] %>%
as_tbl_graph() %>%
# NOTE(review): activate(edges) is immediately overridden by activate(nodes)
# below and has no effect.
activate(edges) %>%
activate(nodes) %>%
mutate(betweenness = centrality_betweenness()) %>%
mutate(degree = centrality_degree(weights = weight)) %>%
as_tibble() %>%
# rank(-x): rank 1 = highest score.
mutate(betweenness_rank = rank(-betweenness))%>%
mutate(degree_rank = rank(-degree)) %>%
left_join(total_pub_counts, by = c('name' = 'actor_id')) %>%
ggplot(aes(betweenness_rank, degree_rank, size = n)) +
# NOTE(review): pch is not a geom_rect aesthetic and is silently ignored here.
geom_rect(aes(xmin = 0, xmax = 100, ymin = 50, ymax = 500), fill = 'red', color = 'black',size = .1, pch = 21, alpha = .1) +
geom_point(alpha = .9, pch = 21)+ scale_size_area() + theme_bw()
Plotting both and looking for outliers — nodes with ‘surprisingly’ high betweenness rank considering their degree (look in the highlighted area for a start) — can reveal interesting ‘bridges’: actors not important in their own right, but holding separate parts of the network together:
# Interactive version of the rank-vs-rank plot: hover text shows the actor's
# unified name, so outliers can be identified directly.
p = gg[[2]] %>%
as_tbl_graph() %>%
# NOTE(review): activate(edges) is immediately overridden by activate(nodes).
activate(edges) %>%
activate(nodes) %>%
mutate(betweenness = centrality_betweenness()) %>%
mutate(degree = centrality_degree(weights = weight)) %>%
as_tibble()%>%
rename(actor_id = name) %>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = 'actor_id') %>%
mutate(betweenness_rank = rank(-betweenness))%>%
mutate(degree_rank = rank(-degree)) %>%
left_join(total_pub_counts, by = c('actor_id' = 'actor_id')) %>%
# `text` is not a ggplot2 aesthetic; it is picked up by ggplotly for tooltips.
ggplot(aes(betweenness_rank, degree_rank, size = n, text = name_unified)) +
geom_point(alpha = .5) + scale_size_area()
plotly::ggplotly(p)
For instance David Niven (16th for betweenness, 425th for degree)
# ESTC ids of every work David Niven is attached to.
niven_books = gkl_actors %>% filter(actor_id == 'davidniven_0') %>% pull(estc_id)
# All low-divergence records for those works (i.e. his collaborators' rows).
gkl_actors %>%
filter(estc_id %in% niven_books & J_divergence <.42)
Who did they work with?
# Rebuild the low-divergence bipartite graph and its actor projection.
# (Same construction as earlier in the file; repeated so this section can
# run on its own.)
g = gkl_actors %>% filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
group_by(estc_id, actor_id) %>%
summarise(diverg = mean(J_divergence)) %>%
graph_from_data_frame( directed=FALSE)
V(g)$type <- bipartite_mapping(g)$type
# bipartite_projection() replaces the deprecated bipartite.projection().
gg = bipartite_projection(g)
actor_net = gg[[2]]
# Direct neighbours of David Niven in the actor network, with names attached.
names(neighbors(actor_net, 'davidniven_0')) %>% as_tibble()%>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))
H.D. Symonds (19th for betweenness, 169th for degree):
# Works H.D. Symonds (actor_id 62707926) is attached to.
symonds_books = gkl_actors %>% filter(actor_id == '62707926') %>% pull(estc_id)
# Low-divergence co-workers on those works (output pasted below).
gkl_actors %>%
filter(estc_id %in% symonds_books & J_divergence <.42) %>%
filter(!is.na(actor_id)) %>%
pull(actor_id)
## [1] "bbti_34706" "bbti_58231" "62707926" "62707926"
## [5] "bbti_64114" "messrichardson_0" "bbti_57604" "62707926"
## [9] "24680588" "bbti_22175" "62707926" "bbti_58231"
## [13] "16320887" "62707926" "bbti_5526" "bbti_77753"
## [17] "39231857" "62707926" "bbti_109535" "peterhill_0"
## [21] "17068965" "62707926" "bbti_97091" "bbti_72631"
## [25] "westandhughes_1" "24680588" "messrichardson_0" "bbti_22173"
## [29] "62707926" "62707926" "bbti_45088" "62707926"
## [33] "bbti_5155" "62707926" "bbti_97091" "bbti_34268"
## [37] "62707926" "bbti_34706" "bbti_58231" "62707926"
## [41] "62707926" "128307380" "bbti_97091" "bbti_59971"
## [45] "bbti_109535" "peterhill_0" "17068965" "62707926"
## [49] "bbti_97091" "bbti_72631" "westandhughes_1" "bbti_59971"
## [53] "bbti_77744" "bbti_62208" "71631229" "bbti_34268"
## [57] "62707926"
# Symonds' direct neighbours in the actor network, with unified names.
names(neighbors(actor_net, '62707926')) %>%
as_tibble()%>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))
J. Duncan and Son (22nd for betweenness, 397th for degree):
# Works J. Duncan and Son are attached to.
duncan_books = gkl_actors %>% filter(actor_id == 'jduncanandson_0') %>% pull(estc_id)
# Low-divergence records for those works.
gkl_actors %>%
filter(estc_id %in% duncan_books & J_divergence <.42)
# Their direct neighbours in the actor network, with unified names.
names(neighbors(actor_net, 'jduncanandson_0')) %>% as_tibble() %>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = c('value' = 'actor_id'))
(Scores a node’s centrality based on its connections to other important nodes). Might suggest book trade actors who were influential because of their connections, or because they ‘had the ear’ of important individuals.
# Eigenvector centrality (weighted): scores actors by connection to other
# well-connected actors, ranked highest first.
actor_net %>%
as_tbl_graph() %>%
mutate(eigen = centrality_eigen(weights = weight)) %>%
as_tibble() %>% arrange(desc(eigen))%>%
left_join(all_actors %>%
select(actor_id, name_unified, viaf_link), by = c('name' = 'actor_id'))
Some changes from highest degree e.g William Strahan is 16th highest by degree but 6th in eigenvector centrality.
# Strongest ties (most shared works): map edge endpoint indices to node
# names via the .N() node accessor, then attach unified names for both ends.
actor_net %>% as_tbl_graph() %>% activate(edges)%>%
mutate(to_name = .N()$name[to],
from_name = .N()$name[from]) %>%
as_tibble() %>%
select(from = from_name, to = to_name, weight) %>%
arrange(desc(weight)) %>%
# The two joins yield name_unified.x (from) and name_unified.y (to).
left_join(all_actors %>% select(name_unified, actor_id), by = c('from' = 'actor_id'))%>%
left_join(all_actors%>% select(name_unified, actor_id), by = c('to' = 'actor_id'))
We could look at overlapping works for these pairs e.g:
# Low-divergence works for each of two actors in a strong pair...
c_works = gkl_actors %>% filter(J_divergence <.42& actor_id == '18758830') %>% pull(estc_id)
s_works = gkl_actors %>% filter(J_divergence <.42& actor_id == '39467138')%>% pull(estc_id)
# ...then the works they share, restricted to the pair's own rows.
intersect(c_works, s_works) %>% as_tibble() %>%
left_join(gkl_actors, by = c('value' = 'estc_id')) %>% filter(actor_id %in% c('18758830', '39467138'))
Calculate degree scores for networks consisting of one year of data:
# Build a one-period actor network from `df` and return per-actor stats.
#
# df: rows of gkl_actors (needs estc_id, actor_id, J_divergence).
# Returns a tibble with one row per actor: name, weighted degree,
# betweenness, and Louvain community id. Errors when the projection cannot
# be built (e.g. nothing left after filtering) — callers wrap it in possibly().
get_yearly_stats = function(df){
g = df %>% filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
group_by(estc_id, actor_id) %>%
summarise(diverg = mean(J_divergence)) %>%
graph_from_data_frame( directed=FALSE)
# Mark works vs actors so the bipartite graph can be projected.
V(g)$type <- bipartite_mapping(g)$type
# bipartite_projection() replaces the deprecated bipartite.projection().
gg = bipartite_projection(g)
actor_net = gg[[2]]
# Return the node stats directly (the `stats` temporary was redundant).
actor_net %>% as_tbl_graph() %>%
mutate(degree = centrality_degree(weights = weight)) %>%
mutate(between = centrality_betweenness()) %>%
mutate(louvain = group_louvain(weights = weight)) %>%
as_tibble()
}
# One data frame per publication year, named by year (names become idcol).
list_of_dfs = list()
for(i in 1700:1800){
list_of_dfs[[as.character(i)]] = gkl_actors %>%
filter(publication_year == i)
}
# Years whose network cannot be built yield NULL instead of an error.
# rbindlist() skips NULL elements natively, so the previous NA_character_
# placeholder plus the ad-hoc na.omit.list() cleanup step are not needed.
results = map(list_of_dfs,
possibly(get_yearly_stats, otherwise = NULL) )
results = rbindlist(results, idcol = 'year')
Sum of degree scores by year:
# Total weighted degree per year (count with wt sums the degree column).
results %>% mutate(year = as.numeric(year)) %>%
count(year, wt = degree) %>%
ggplot() + geom_col(aes(year, n))
Interactive exploratory map of communities found for all books with &lt;.42 divergence. Labels are sized by degree score. Filtered to edges with a weight of more than 1.
# Louvain communities on the low-divergence actor network; `color` drives
# visNetwork node colouring.
g = actor_net %>% as_tbl_graph() %>%
mutate(louvain = group_louvain(weights = weight)) %>%
mutate(color = louvain)
# Keep edges with weight > 1 and drop isolated nodes, then attach display
# attributes (label text, fixed node size, degree-scaled font).
filtered_g = g %>%
activate(edges) %>% filter(weight>1) %>%
activate(nodes) %>%
mutate(degree = centrality_degree(mode = 'all', weights= weight)) %>% filter(degree>0) %>% left_join(all_actors, by = c('name' = 'actor_id')) %>%
mutate(actor_id = name) %>%
mutate(name = paste0(name_unified, " (", name, ")")) %>% mutate(size =5 ) %>%
mutate(font.size =sqrt(degree))
# Interactive map; the dropdown selects nodes by Louvain community.
visNetwork::visIgraph(filtered_g, layout = 'layout_with_kk', physics = T)%>%
visNetwork::visEdges(width = .01, color = list(opacity = .3))%>%
visNetwork::visOptions(selectedBy = "louvain")
Looks like two ‘core’ communities, community 1 with Lowndes, Fauldner etc. I think some of these are generational communities - because there are father/son pairs in different communities.
To check:
# Node table for the ten largest communities.
comms = g %>% filter(louvain %in% 1:10) %>%as_tibble()
# Records per community per year: a stacked bar chart to check whether the
# communities are generational (temporally separated).
gkl_actors %>% left_join(comms, by = c('actor_id' = 'name')) %>%
count(louvain, publication_year) %>% filter(!is.na(louvain)) %>%
ggplot() + geom_col(aes(x = publication_year, y =n, fill = as.factor(louvain)))
Might be more meaningful to divide the communities by 10/20 years.
Not all are just temporal. For example, community 3 is almost completely cut off, except for a connection through Luke White to John Debrett. These are Dublin book trade actors:
# Publication places for community 3's records — checks the Dublin hypothesis.
g %>%
filter(louvain ==3) %>%
pull(name) %>% as_tibble() %>%
inner_join(gkl_actors, by = c('value' = 'actor_id')) %>% count(publication_place)
What is the divergence profile of books worked on by actors in these communities?
# actor_id -> community lookup for joining back onto the raw records.
louvain_df = g %>% as_tibble() %>% select(name, louvain)
# J_divergence density per community (first 15 communities).
gkl_actors %>%
left_join(louvain_df, by = c('actor_id'= 'name')) %>%
filter(louvain %in% 1:15) %>%
ggplot() +
geom_density(aes(J_divergence)) +
facet_wrap(~louvain, ncol = 3)
What authors did they publish on?
# Top ten authors per community (low-divergence records), collapsed into a
# single "author (count)" string per community.
gkl_actors %>% filter(J_divergence<.42) %>%
left_join(louvain_df, by = c('actor_id'= 'name')) %>%
filter(louvain %in% 1:10) %>%
count(louvain, author) %>%
arrange(desc(n)) %>%
filter(!is.na(author)) %>%
group_by(louvain) %>% top_n(10, wt = n) %>%
summarise(authors = paste0(author, " (", n, ")", collapse = "; "))
What about the j_divergence profiles of the entire network?
# Full network: same bipartite build + actor projection, but with NO
# divergence filter, so every actor-work record contributes.
all_g = gkl_actors %>%
filter(!is.na(actor_id)) %>%
group_by(estc_id, actor_id) %>%
summarise(diverg = mean(J_divergence)) %>%
graph_from_data_frame( directed=FALSE)
V(all_g)$type <- bipartite_mapping(all_g)$type
#gg <- unipartite_projection_attr(gkl_actors_graph, "diverg", FALSE)
# bipartite_projection() replaces the deprecated bipartite.projection().
all_gg = bipartite_projection(all_g)
# Weighted Louvain communities for every actor in the full network.
all_gg_louvain_w = all_gg[[2]] %>%
as_tbl_graph() %>%
mutate(louvain = group_louvain(weights = weight)) %>%
as_tibble()
# J_divergence histograms per full-network community (first 15), free y
# scales because community sizes differ widely.
gkl_actors %>% left_join(all_gg_louvain_w, by = c('actor_id' = 'name')) %>%
filter(louvain %in% 1:15) %>%
ggplot() +
geom_histogram(aes(J_divergence), binwidth = .01) +
facet_wrap(~louvain, ncol = 3, scales = 'free_y')
# Weighted degree per actor in the full network.
# NOTE(review): this result is recomputed identically further down before it
# is first used — one of the two definitions could be removed.
degree_scores = all_gg[[2]] %>%
as_tbl_graph() %>%
mutate(degree = centrality_degree(mode = 'all', weights = weight)) %>%
as_tibble()
# Per-actor work list, each work labelled "estc_id (J_divergence)".
edges = gkl_actors %>%
mutate(estc_id_with_j = paste0(estc_id, " (", round(J_divergence,3), ")")) %>% select(actor_id, estc_id,estc_id_with_j)
# Node table with a row-number id matching igraph's integer edge endpoints.
nodes = all_gg[[2]] %>% as_tbl_graph() %>% as_tibble() %>% mutate(id = 1:nrow(.))
# Expand each projected edge into one row per work the pair shared:
# resolve endpoint ids to actor names, cross both actors' work lists, and
# keep only the works they have in common (estc_id.x == estc_id.y).
multigraph_edges = all_gg[[2]] %>%
as_tbl_graph() %>% activate(edges) %>%
as_tibble() %>%
left_join(nodes, by = c('from' = 'id')) %>%
left_join(nodes, by = c('to' = 'id')) %>%
left_join(edges, by =c('name.x' = 'actor_id'))%>%
left_join(edges, by =c('name.y' = 'actor_id')) %>%
filter(estc_id.x == estc_id.y) %>%
select(from, to, name.x, name.y, estc_id.x, estc_id_with_j.x) %>%
left_join(gkl_actors %>%
distinct(estc_id, short_title), by = c('estc_id.x' = 'estc_id')) %>%
# Display label: "short_title (estc_id (J_divergence))".
mutate(title = paste0(short_title, " (", estc_id_with_j.x, ")"))%>%
left_join(all_actors %>%
select(actor_id, name_unified), by = c('name.x' = 'actor_id'))%>%
left_join(all_actors %>%
select(actor_id, name_unified), by = c('name.y' = 'actor_id'))%>%
mutate(name.x = paste0(name_unified.x, " (", name.x, ")")) %>%
mutate(name.y = paste0(name_unified.y, " (", name.y, ")"))
# Collapse back to one row per edge: semicolon-separated shared-work titles.
works_shared = multigraph_edges %>% group_by(from, to, name.x, name.y) %>%
summarise(works = paste0(title, collapse = '; '))
# Interactive full-network map: edge tooltips list the shared works (HTML
# `title` attribute), nodes coloured by Louvain community, edges filtered
# to weight > 3 and widths scaled by sqrt(weight).
filtered_g = all_gg[[2]] %>%
as_tbl_graph()%>%
activate(edges) %>%
left_join(works_shared, by = c('from', 'to')) %>%
mutate(title = paste0("<b>Actor1: </b>", name.x, "<br><b>Actor2: </b>", name.y, "<br><b>WORKS: </b><br>", works)) %>%
activate(nodes) %>%
mutate(louvain = group_louvain(weights = weight)) %>%
mutate(color = louvain) %>%
activate(edges) %>%
filter(weight>3) %>%
activate(nodes) %>%
mutate(degree = centrality_degree(weights = weight)) %>%
filter(degree>0) %>% mutate(size = 5) %>% filter(louvain %in% 1:30) %>% activate(edges) %>% mutate(width = sqrt(weight))
visNetwork::visIgraph(filtered_g, layout = 'layout_with_fr', physics = F)%>%
visNetwork::visEdges(width = .5, color = list(opacity = .7))%>%
visNetwork::visOptions(selectedBy = "louvain")
This table lists works and actors for each community (taken from all the gkl_actors data).
The works column shows the twenty most frequent works worked on by actors in each community. The first number in parentheses is the j_divergence score, and the second the number of times that work is found. Editions have not been combined because they will have slightly different j_divergence scores.
The actors column is the top most significant actors by degree score, which is included in parentheses.
# Weighted degree per actor (full network).
# NOTE(review): identical to the degree_scores definition earlier in the
# file; the earlier one is unused before this point.
degree_scores = all_gg[[2]] %>%
as_tbl_graph() %>%
mutate(degree = centrality_degree(mode = 'all', weights = weight)) %>%
as_tibble()
# Top 20 actors by degree per community, as "name (id) (degree)" strings.
actors = all_gg_louvain_w %>%
filter(louvain %in% 1:30) %>%
left_join(degree_scores) %>% arrange(desc(degree)) %>%
left_join(all_actors, by = c('name' = 'actor_id')) %>%
mutate(name = paste0(name_unified, " (", name, ") (", degree, ")")) %>%
group_by(louvain) %>% top_n(20, degree) %>%
summarise(names = paste0(name, collapse = '; '))
# Top 20 works per community, low-divergence only, labelled
# "title (count) (J_divergence)"; combined with the actor column above.
works = all_gg_louvain_w %>% filter(louvain %in% 1:30) %>%
left_join(gkl_actors, by = c('name' = 'actor_id')) %>%
count(louvain, estc_id, short_title) %>%
left_join(gkl_actors %>% distinct(estc_id, .keep_all = T) %>% select(estc_id, J_divergence)) %>% filter(J_divergence <.42)%>%
mutate(work_with_count = paste0(short_title, " (", n, ")", " (", round(J_divergence,4), ")"))%>%
arrange(-n) %>%
group_by(louvain) %>%
top_n(20, n) %>%
summarise(works = paste0(unique(work_with_count), collapse = '; ')) %>% left_join(actors, by = 'louvain')%>% DT::datatable()
works
Communities in blocks of 10 years:
# Per-period actor-network stats. NOTE(review): this redefines the identical
# get_yearly_stats from earlier in the file (kept so this section is
# self-contained in the rendered document).
#
# df: rows of gkl_actors. Returns one row per actor: name, weighted degree,
# betweenness, Louvain community. Errors on unbuildable input, which is why
# callers wrap it in possibly().
get_yearly_stats = function(df){
g = df %>% filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
group_by(estc_id, actor_id) %>%
summarise(diverg = mean(J_divergence)) %>%
graph_from_data_frame( directed=FALSE)
V(g)$type <- bipartite_mapping(g)$type
# bipartite_projection() replaces the deprecated bipartite.projection().
gg = bipartite_projection(g)
actor_net = gg[[2]]
# Return the node stats directly (the `stats` temporary was redundant).
actor_net %>% as_tbl_graph() %>%
mutate(degree = centrality_degree(weights = weight)) %>%
mutate(between = centrality_betweenness()) %>%
mutate(louvain = group_louvain(weights = weight)) %>%
as_tibble()
}
# One data frame per decade, named by decade start (names become idcol).
list_of_dfs_10 = list()
for(i in seq(1700, 1800, 10)){
list_of_dfs_10[[as.character(i)]] = gkl_actors %>%
filter(publication_decade == i)
}
# otherwise = NULL: failed decades are skipped by rbindlist() directly,
# replacing the NA_character_ placeholder + na.omit.list() workaround.
results = map(list_of_dfs_10,
possibly(get_yearly_stats, otherwise = NULL) )
results = rbindlist(results, idcol = 'year')
So for example the 1750s:
# Divergence density per community for the 1750s networks.
# NOTE(review): results$year is character (from idcol), so `year == 1750`
# relies on R coercing 1750 to "1750" — it works, but is fragile.
gkl_actors %>% filter(publication_decade == 1750)%>%
inner_join(results %>% filter(year == 1750), by =c('actor_id' = 'name'))%>%
filter(louvain %in% 1:15) %>%
ggplot() +
geom_density(aes(J_divergence)) +
facet_wrap(~louvain, ncol = 3)
Use the counts of over and under-threshold books worked on as covariate to predict edge ties in the whole network:
# ERGM: does the count of low-divergence books an actor worked on predict
# tie formation in the full co-publication network?
# Edge list: every distinct actor-work pair (no divergence filter).
df = gkl_actors %>%
filter(!is.na(actor_id)) %>%
distinct(actor_id, estc_id)
# Per-actor counts of works below / at-or-above the .42 threshold.
j_count = gkl_actors %>%
filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
count(actor_id, name = 'j_count')
not_j_count = gkl_actors %>%
filter(J_divergence>=.42) %>%
filter(!is.na(actor_id)) %>%
count(actor_id, name = 'not_j_count')
g = df%>%
graph_from_data_frame( directed=FALSE)
V(g)$type <- bipartite_mapping(g)$type
# bipartite_projection() replaces the deprecated bipartite.projection().
gg = bipartite_projection(g)
# Attach the counts as node covariates; actors missing from a count table
# worked on zero such books, hence the NA -> 0 replacement.
books_net = gg[[2]] %>%
as_tbl_graph() %>%
left_join(j_count,
by = c('name' = 'actor_id')) %>%
left_join(not_j_count,
by = c('name' = 'actor_id')) %>%
mutate(j_count = ifelse(is.na(j_count), 0, j_count)) %>%
mutate(not_j_count = ifelse(is.na(not_j_count), 0, not_j_count))
# igraph -> network object for statnet, then fit edges + nodecov terms.
stats_g_net = intergraph::asNetwork(books_net)
m1 <- ergm(stats_g_net ~ edges + nodecov('j_count'))
summary(m1)
## Call:
## ergm(formula = stats_g_net ~ edges + nodecov("j_count"))
##
## Iterations: 9 out of 20
##
## Monte Carlo MLE Results:
## Estimate Std. Error MCMC % z value Pr(>|z|)
## edges -6.6869474 0.0058794 0 -1137.3 <1e-04 ***
## nodecov.j_count 0.0492235 0.0002061 0 238.8 <1e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Null Deviance: 31175779 on 22488571 degrees of freedom
## Residual Deviance: 472283 on 22488569 degrees of freedom
##
## AIC: 472287 BIC: 472317 (Smaller is better.)
Getting the estimate by decade:
# Fit the edges + nodecov('j_count') ERGM separately for each decade and
# collect the coefficient estimates into one table.
results = list()
for(i in seq(1700, 1800, 10)){
# Distinct actor-work pairs for this decade.
df = gkl_actors %>% filter(publication_decade== i) %>%
filter(!is.na(actor_id)) %>%
distinct(actor_id, estc_id)
# Counts of low- / high-divergence works per actor in this decade.
j_count = gkl_actors %>% filter(publication_decade== i)%>%
filter(J_divergence<.42) %>%
filter(!is.na(actor_id)) %>%
count(actor_id, name = 'j_count')
not_j_count = gkl_actors %>% filter(publication_decade== i) %>%
filter(J_divergence>=.42) %>%
filter(!is.na(actor_id)) %>%
count(actor_id, name = 'not_j_count')
g = df%>%
graph_from_data_frame( directed=FALSE)
V(g)$type <- bipartite_mapping(g)$type
# bipartite_projection() replaces the deprecated bipartite.projection().
gg = bipartite_projection(g)
# Node covariates: NA counts mean zero such books.
books_net = gg[[2]] %>%
as_tbl_graph() %>%
left_join(j_count,
by = c('name' = 'actor_id')) %>%
left_join(not_j_count,
by = c('name' = 'actor_id')) %>%
mutate(j_count = ifelse(is.na(j_count), 0, j_count)) %>%
mutate(not_j_count = ifelse(is.na(not_j_count), 0, not_j_count))
stats_g_net = intergraph::asNetwork(books_net)
m1 <- ergm(stats_g_net ~ edges + nodecov('j_count'))
# NOTE(review): m1$MCMCtheta is an internal slot; coef(m1) is the documented
# accessor — confirm they agree before swapping.
r = m1$MCMCtheta %>% as_tibble()
r$type = c('edges', 'j')   # row order matches the two model terms
r$year = i
results[[as.character(i)]] = r
}
results = rbindlist(results)
# Per-decade nodecov('j_count') estimate as a bar chart.
results %>%
filter(type =='j') %>%
ggplot() + geom_col(aes(year, value))
What other factors influence edge ties?